//-
// ==========================================================================
// Copyright (C) 1995 - 2006 Autodesk, Inc. and/or its licensors.  All 
// rights reserved.
//
// The coded instructions, statements, computer programs, and/or related 
// material (collectively the "Data") in these files contain unpublished 
// information proprietary to Autodesk, Inc. ("Autodesk") and/or its 
// licensors, which is protected by U.S. and Canadian federal copyright 
// law and by international treaties.
//
// The Data is provided for use exclusively by You. You have the right 
// to use, modify, and incorporate this Data into other products for 
// purposes authorized by the Autodesk software license agreement, 
// without fee.
//
// The copyright notices in the Software and this entire statement, 
// including the above license grant, this restriction and the 
// following disclaimer, must be included in all copies of the 
// Software, in whole or in part, and all derivative works of 
// the Software, unless such copies or derivative works are solely 
// in the form of machine-executable object code generated by a 
// source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. 
// AUTODESK DOES NOT MAKE AND HEREBY DISCLAIMS ANY EXPRESS OR IMPLIED 
// WARRANTIES INCLUDING, BUT NOT LIMITED TO, THE WARRANTIES OF 
// NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR 
// PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE, OR 
// TRADE PRACTICE. IN NO EVENT WILL AUTODESK AND/OR ITS LICENSORS 
// BE LIABLE FOR ANY LOST REVENUES, DATA, OR PROFITS, OR SPECIAL, 
// DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES, EVEN IF AUTODESK 
// AND/OR ITS LICENSORS HAS BEEN ADVISED OF THE POSSIBILITY 
// OR PROBABILITY OF SUCH DAMAGES.
//
// ==========================================================================
//+

#include <math.h>

#include <maya/MIOStream.h>
#include <maya/MSimple.h>
#include <maya/MTimer.h>
#include <maya/MGlobal.h>
#include <maya/MThreadAsync.h>
#include <maya/MThreadPool.h>
#include <maya/MSpinLock.h>
#include <maya/MMutexLock.h>

// Register a simple, argument-parsing MEL command named
// "threadTestWithLocksCmd" with Maya (MSimple framework).
DeclareSimpleCommand( threadTestWithLocksCmd, PLUGIN_COMPANY, "2008");


// Number of tasks handed to the thread pool per parallel region.
#define NUM_TASKS	        32
// Number of threads spawned for the asynchronous-thread test.
#define NUM_ASYNC_TASKS	13

// Forward declaration of the thread-pool worker defined below.
MThreadRetVal Pi(void *data);


// Per-task argument block handed to each worker thread. Each task
// integrates midpoint samples in the half-open range [start, end).
typedef struct _threadDataTag
{
	int threadNo;	// task index; informational only
	double pi, step, *globalPi;	// partial sum out, interval width in, shared total (lock-protected variants)
	int start, end;	// half-open iteration range for this task
	MSpinLock *spinLock;	// guards *globalPi in SpinSyncPi
	MMutexLock *mutexLock;	// guards *globalPi in MutexSyncPi

}threadData;

// Parameters passed into a parallel region: total iteration count in,
// accumulated (unnormalized) pi sum out.
typedef struct _taskDataTag
{
	int      iterations;	// total number of midpoint samples
	double    totalPi;	// sum of 4/(1+x^2) over all samples

}taskData;

// Pi computation. This function is called from a single thread.
//
// Approximates pi via the midpoint rule applied to the integral of
// 4/(1+x^2) over [0,1]. With i running from 0 to iterations-1, the
// midpoint of subinterval i is (i+0.5)*step; the original (i-0.5)
// sampled every point one interval low (including a point outside
// [0,1]), degrading the error from O(step^2) to O(step).
//
// iterations : number of subintervals (<= 0 returns 0; the original
//              produced NaN via 0/0 for iterations == 0)
// returns    : the pi estimate
double CalcPiSingleThreaded(int iterations)
{
	if( iterations <= 0 )
		return 0.0;

	double pi = 0;
	double step = 1.0/double(iterations);
	for( int i = 0; i < iterations; i++ )
	{
		double x = step * ((double)i+0.5);
		pi += 4.0 / (1.0 + x*x);
	}
	pi /= double(iterations);
	return pi;
}

// Pi computation. This function is called from multiple threads.
//
// Integrates 4/(1+x^2) over this task's [start, end) slice using the
// midpoint rule and stores the partial sum in myData->pi for the
// decompose function to accumulate after the join.
//
// data : threadData describing the slice (start, end, step) and
//        receiving the partial sum (pi). Always returns 0.
MThreadRetVal Pi(void *data)
{
	threadData *myData = (threadData *)data;
	double pi = 0;

	for( int i = myData->start; i < myData->end; i++ )
	{
		// (i+0.5)*step is the midpoint of subinterval i; the original
		// (i-0.5) sampled one interval low, reducing accuracy.
		double x = myData->step * ((double)i+0.5);
		pi += 4.0 / (1.0 + x*x);
	}
	myData->pi = pi;

	return (MThreadRetVal)0;
}

// Function to create thread tasks.
//
// Splits taskD->iterations across NUM_TASKS worker tasks, runs them
// via the thread pool, and accumulates each task's partial sum into
// taskD->totalPi.
void DecomposePi(void *data, MThreadRootTask *root)
{
	taskData *taskD = (taskData *)data;

	int iterationsPerTask = taskD->iterations/NUM_TASKS;

	// Per-task argument blocks; they must stay alive until
	// executeAndJoin returns, which happens before this frame unwinds.
	// (The original guarded this stack array with an always-true
	// "if (tdata)" check, removed here.)
	threadData tdata[NUM_TASKS];

	// Divide at double precision; the original 1.0f/iterations
	// computed the step at float precision.
	double step = 1.0/taskD->iterations;

	for( int i = 0; i < NUM_TASKS; ++i )
	{
		tdata[i].threadNo    = i;
		tdata[i].pi          = 0;
		tdata[i].start       = i*iterationsPerTask;
		// The last task absorbs the remainder so no iterations are
		// dropped when iterations is not a multiple of NUM_TASKS
		// (the original silently skipped the trailing remainder).
		tdata[i].end         = ( i == NUM_TASKS-1 ) ? taskD->iterations
		                                            : tdata[i].start + iterationsPerTask;
		tdata[i].step        = step;

		MThreadPool::createTask(Pi, (void *)&tdata[i], root);
	}

	MThreadPool::executeAndJoin(root);

	for( int i = 0; i < NUM_TASKS; ++i )
	{
		taskD->totalPi += tdata[i].pi;
	}
}

// Set up and tear down parallel tasks
double CalcPi(int iterations)
{
	double              Pi = 0;
	taskData            tdata;

	tdata.iterations = 0; tdata.totalPi = 0;
	
	MStatus pr = MThreadPool::init();
	if( MStatus::kSuccess == pr )
	{
		tdata.iterations  = iterations;
		tdata.totalPi     = 0;
		
		MThreadPool::newParallelRegion(DecomposePi, (void *)&tdata);
		
		Pi = tdata.totalPi/iterations;
		MThreadPool::release();
	}
	MThreadPool::release();

	return Pi;
}

// Pi computation with spin-lock. This function is called from multiple
// threads.
//
// Integrates 4/(1+x^2) over this task's [start, end) slice with the
// midpoint rule, then adds the partial sum into the shared total under
// the spin lock — the critical section is a single addition.
//
// data : threadData with the slice bounds, step, the shared globalPi
//        pointer, and the shared spinLock. Always returns 0.
MThreadRetVal SpinSyncPi(void *data)
{
	threadData *myData = (threadData *)data;
	double pi = 0;

	for( int i = myData->start; i < myData->end; i++ )
	{
		// (i+0.5)*step is the midpoint of subinterval i; the original
		// (i-0.5) sampled one interval low, reducing accuracy.
		double x = myData->step * ((double)i+0.5);
		pi += 4.0 / (1.0 + x*x);
	}
	myData->spinLock->lock();
	(*myData->globalPi) += pi;
	myData->spinLock->unlock();

	return 0;
}

// Function to create thread tasks.
//
// Splits taskD->iterations across NUM_TASKS tasks that all accumulate
// directly into taskD->totalPi under a shared spin lock.
void DecomposeSpinSyncPi(void *data, MThreadRootTask *root)
{
	taskData *taskD = (taskData *)data;

	int iterationsPerTask = taskD->iterations/NUM_TASKS;

	// Stack-allocated lock (RAII) instead of the original's
	// new/delete pair; it outlives every task because executeAndJoin
	// returns before this frame unwinds.
	MSpinLock spinLock;

	threadData tdata[NUM_TASKS];

	// Divide at double precision; the original 1.0f/iterations
	// computed the step at float precision.
	double step = 1.0/taskD->iterations;

	for( int i = 0; i < NUM_TASKS; ++i )
	{
		tdata[i].threadNo = i;
		tdata[i].pi       = 0;
		tdata[i].globalPi = &taskD->totalPi;
		tdata[i].start    = i*iterationsPerTask;
		// The last task absorbs the remainder so no iterations are
		// dropped when iterations is not a multiple of NUM_TASKS.
		tdata[i].end      = ( i == NUM_TASKS-1 ) ? taskD->iterations
		                                         : tdata[i].start + iterationsPerTask;
		tdata[i].step     = step;
		tdata[i].spinLock = &spinLock;

		MThreadPool::createTask(SpinSyncPi, (void *)&tdata[i], root);
	}

	MThreadPool::executeAndJoin(root);

	// No post-join accumulation: the workers add their partial sums
	// into taskD->totalPi under the lock. (The original also summed
	// tdata[i].pi afterwards, but those fields stay 0 — the loop
	// added nothing.)
}

// Pi computation with mutex lock. This function is called from
// multiple threads.
//
// Integrates 4/(1+x^2) over this task's [start, end) slice with the
// midpoint rule, then adds the partial sum into the shared total under
// the mutex — the critical section is a single addition.
//
// data : threadData with the slice bounds, step, the shared globalPi
//        pointer, and the shared mutexLock. Always returns 0.
MThreadRetVal MutexSyncPi(void *data)
{
	threadData *myData = (threadData *)data;
	double pi = 0;

	for( int i = myData->start; i < myData->end; i++ )
	{
		// (i+0.5)*step is the midpoint of subinterval i; the original
		// (i-0.5) sampled one interval low, reducing accuracy.
		double x = myData->step * ((double)i+0.5);
		pi += 4.0 / (1.0 + x*x);
	}
	myData->mutexLock->lock();
	(*myData->globalPi) += pi;
	myData->mutexLock->unlock();

	return 0;
}

// Function to create thread tasks.
//
// Splits taskD->iterations across NUM_TASKS tasks that all accumulate
// directly into taskD->totalPi under a shared mutex.
void DecomposeMutexSyncPi(void *data, MThreadRootTask *root)
{
	taskData *taskD = (taskData *)data;

	int iterationsPerTask = taskD->iterations/NUM_TASKS;

	// Stack-allocated lock (RAII) instead of the original's
	// new/delete pair; it outlives every task because executeAndJoin
	// returns before this frame unwinds.
	MMutexLock mutexLock;

	threadData tdata[NUM_TASKS];

	// Divide at double precision; the original 1.0f/iterations
	// computed the step at float precision.
	double step = 1.0/taskD->iterations;

	for( int i = 0; i < NUM_TASKS; ++i )
	{
		tdata[i].threadNo  = i;
		tdata[i].pi        = 0;
		tdata[i].globalPi  = &taskD->totalPi;
		tdata[i].start     = i*iterationsPerTask;
		// The last task absorbs the remainder so no iterations are
		// dropped when iterations is not a multiple of NUM_TASKS.
		tdata[i].end       = ( i == NUM_TASKS-1 ) ? taskD->iterations
		                                          : tdata[i].start + iterationsPerTask;
		tdata[i].step      = step;
		tdata[i].mutexLock = &mutexLock;

		MThreadPool::createTask(MutexSyncPi, (void *)&tdata[i], root);
	}

	MThreadPool::executeAndJoin(root);

	// No post-join accumulation: the workers add their partial sums
	// into taskD->totalPi under the lock. (The original also summed
	// tdata[i].pi afterwards, but those fields stay 0 — the loop
	// added nothing.)
}

// Set up and tear down parallel tasks
double CalcSpinSyncPi(int iterations)
{
	double              Pi = 0;
	taskData            tdata;
	MStatus pr = MThreadPool::init();
	if( MStatus::kSuccess == pr )
	{
		tdata.iterations  = iterations;
		tdata.totalPi     = 0;
		
		MThreadPool::newParallelRegion(DecomposeSpinSyncPi, (void *)&tdata);

		Pi = tdata.totalPi/iterations;
		MThreadPool::release();
	}
	MThreadPool::release();
		
    return Pi;
}

// Set up and tear down parallel tasks
double CalcMutexSyncPi(int iterations)
{
	double              Pi = 0;
	taskData            tdata;
	MStatus pr = MThreadPool::init();
	if( MStatus::kSuccess == pr )
	{
		tdata.iterations  = iterations;
		tdata.totalPi     = 0;
		
		MThreadPool::newParallelRegion(DecomposeMutexSyncPi, (void *)&tdata);

		Pi = tdata.totalPi/iterations;
		MThreadPool::release();
	}
	MThreadPool::release();

    return Pi;
}

// Compute pi. This function is called from multiple asynchronous
// threads.
//
// Integrates 4/(1+x^2) over this task's [start, end) slice with the
// midpoint rule. The partial sum is accumulated locally and stored
// into myData->pi (which the caller pre-initializes to 0) so the main
// thread can sum it after the barrier.
MThreadRetVal AsyncPi(void *data)
{
	threadData *myData = (threadData *)data;
	double pi = 0;

	for( int i = myData->start; i < myData->end; i++ )
	{
		// (i+0.5)*step is the midpoint of subinterval i; the original
		// (i-0.5) sampled one interval low, reducing accuracy.
		double x = myData->step * ((double)i+0.5);
		pi += 4.0 / (1.0 + x*x);
	}
	myData->pi = pi;
	return 0;
}

// Variable to track thread completion. As each async worker thread
// finishes its work, its completion callback (AsyncCB) increments this
// counter. The main thread polls in WaitForAsyncThreads until the
// counter equals the thread count, meaning all threads have completed
// their work.
static volatile int g_async_count = 0; 

// Spin lock guarding increments of g_async_count in AsyncCB.
static MSpinLock asyncSpinLock;

// Completion callback: bump the thread-completion counter. The spin
// lock serializes the read-modify-write so two workers finishing at
// the same moment cannot race on the update. The callback payload is
// unused.
void AsyncCB(void *data)
{
	asyncSpinLock.lock();
	++g_async_count;
	asyncSpinLock.unlock();
}

static MSpinLock exchangeSpinLock;

// test if variable matches the expected value. Locks required to
// ensure threadsafe access to variables
bool Maya_InterlockedCompare(volatile int* variable, int compareValue)
{
	exchangeSpinLock.lock();
	bool rtn = (*variable == compareValue);
	exchangeSpinLock.unlock();
	return rtn;
}

// Barrier function. The main thread spins here, yielding its timeslice
// on each pass, until the completion counter reaches val — i.e. until
// every worker thread has checked in via AsyncCB.
void WaitForAsyncThreads(int val)
{
	for(;;)
	{
		if( Maya_InterlockedCompare(&g_async_count, val) )
			break;
#if defined(OSWin_)
		Sleep(0);
#else
		sleep(0);
#endif
	}
}

// Set up and tear down asynchronous thread tasks
double CalcAsyncPi(int iterations)
{
    double              Pi = 0;

    g_async_count = 0;
	MStatus pr = MThreadAsync::init();
	if( MStatus::kSuccess == pr )
    {
		int iterationsPerTask = iterations/NUM_ASYNC_TASKS;
		int limit;

		threadData tdata[NUM_ASYNC_TASKS];
		
		double step = 1.0f/iterations;
		
		for( int i = 0; i < NUM_ASYNC_TASKS; ++i )
        {
			limit                = (i+1)*iterationsPerTask;
			tdata[i].threadNo    = i;
			tdata[i].pi          = 0;
			tdata[i].start       = i*iterationsPerTask;
			limit                = tdata[i].start + iterationsPerTask;
			tdata[i].end         = ( limit < iterations) ? limit : iterations;
			tdata[i].step        = step;
			
			// start threads. Each thread makes a call to AsyncCB
			// when completed which increments a counter. The wait
			// function below waits until all threads have completed
			// and incremented this counter.
			pr = MThreadAsync::createTask(AsyncPi, (void *)&tdata[i], 
										  AsyncCB, NULL);
			if( pr != MStatus::kSuccess )
            {
				return 0;
			}
		}

		// barrier here, waits until all threads have completed before continuing
		WaitForAsyncThreads(NUM_ASYNC_TASKS);
		
		// accumulate sum from each thread
		for( int i = 0; i < NUM_ASYNC_TASKS; ++i )
        {
			Pi += tdata[i].pi;
		}
		Pi /= iterations;
		MThreadAsync::release(); // release async thread
	}
	MThreadAsync::release(); // shut down threads

    return Pi;
}

// MSimple command that invokes the serial and parallel thread calculations
MStatus threadTestWithLocksCmd::doIt( const MArgList& args )
{
	MString str = MString("Computation of pi using the Maya API\n");
	MGlobal::displayInfo(str);

	if(args.length() != 1) {
		MString str = MString("Invalid number of arguments, supply iteration count, usage: threadTestWithLocksCmd 100000");
		MGlobal::displayError(str);
		return MStatus::kFailure;
	}

	MStatus stat;

	int iterations = args.asInt( 0, &stat );
	if ( MS::kSuccess != stat ) {
		MString str = MString("Invalid number of arguments, supply iteration count, usage: threadTestWithLocksCmd 100000");
		MGlobal::displayError(str);
		return MStatus::kFailure;
	}

	double pi = 0.0;
	MString piStr;
	double elapsedTime = 0.0;
	MTimer timer;

	// run single threaded
	timer.beginTimer();
	pi = CalcPiSingleThreaded(iterations);
	timer.endTimer();
	elapsedTime = timer.elapsedTime();
	piStr.set(pi, 10); // 10 digits of precision
	str = MString("Unthreaded computation, pi = ") + piStr + MString(" calculated in ") + elapsedTime + MString("s");
	MGlobal::displayInfo(str);

	// run single threaded
	timer.beginTimer();
	pi = CalcPi(iterations);
	timer.endTimer();
	elapsedTime = timer.elapsedTime();
	piStr.set(pi, 10); // 10 digits of precision
	str = MString("Threaded computation, pi = ") + piStr + MString(" calculated in ") + elapsedTime + MString("s");
	MGlobal::displayInfo(str);

	// run threaded with spin-lock synchronisation
	timer.beginTimer();
	pi = CalcSpinSyncPi(iterations);
	timer.endTimer();
	elapsedTime = timer.elapsedTime();
	piStr.set(pi, 10); // 10 digits of precision
	str = MString("Threaded computation with spin-lock synchronization, pi = ") + piStr + MString(" calculated in ") + elapsedTime + MString("s");
	MGlobal::displayInfo(str);

	// run threaded with mutex synchronisation
	timer.beginTimer();
	pi = CalcMutexSyncPi(iterations);
	timer.endTimer();
	elapsedTime = timer.elapsedTime();
	piStr.set(pi, 10); // 10 digits of precision
	str = MString("Threaded computation with mutex synchronization, pi = ") + piStr + MString(" calculated in ") + elapsedTime + MString("s");
	MGlobal::displayInfo(str);

	// run async threaded
	timer.beginTimer();
	pi = CalcAsyncPi(iterations);
	timer.endTimer();
	elapsedTime = timer.elapsedTime();
	piStr.set(pi, 10); // 10 digits of precision
	str = MString("Threaded computation with async thread, pi = ") + piStr + MString(" calculated in ") + elapsedTime + MString("s\n");
	MGlobal::displayInfo(str);

	return MStatus::kSuccess;
}
